diff --git a/benches/bigint.rs b/benches/bigint.rs
index 960b7cee..80ec191c 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -174,35 +174,40 @@ fn fib_to_string(b: &mut Bencher) {
     b.iter(|| fib.to_string());
 }
 
-fn to_str_radix_bench(b: &mut Bencher, radix: u32) {
+fn to_str_radix_bench(b: &mut Bencher, radix: u32, bits: u64) {
     let mut rng = get_rng();
-    let x = rng.gen_bigint(1009);
+    let x = rng.gen_bigint(bits);
     b.iter(|| x.to_str_radix(radix));
 }
 
 #[bench]
 fn to_str_radix_02(b: &mut Bencher) {
-    to_str_radix_bench(b, 2);
+    to_str_radix_bench(b, 2, 1009);
 }
 
 #[bench]
 fn to_str_radix_08(b: &mut Bencher) {
-    to_str_radix_bench(b, 8);
+    to_str_radix_bench(b, 8, 1009);
 }
 
 #[bench]
 fn to_str_radix_10(b: &mut Bencher) {
-    to_str_radix_bench(b, 10);
+    to_str_radix_bench(b, 10, 1009);
+}
+
+#[bench]
+fn to_str_radix_10_2(b: &mut Bencher) {
+    to_str_radix_bench(b, 10, 10009);
 }
 
 #[bench]
 fn to_str_radix_16(b: &mut Bencher) {
-    to_str_radix_bench(b, 16);
+    to_str_radix_bench(b, 16, 1009);
 }
 
 #[bench]
 fn to_str_radix_36(b: &mut Bencher) {
-    to_str_radix_bench(b, 36);
+    to_str_radix_bench(b, 36, 1009);
 }
 
 fn from_str_radix_bench(b: &mut Bencher, radix: u32) {
diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs
index 278ec782..edeed84c 100644
--- a/src/biguint/convert.rs
+++ b/src/biguint/convert.rs
@@ -15,7 +15,7 @@ use core::cmp::Ordering::{Equal, Greater, Less};
 use core::convert::TryFrom;
 use core::mem;
 use core::str::FromStr;
-use num_integer::Integer;
+use num_integer::{Integer, Roots};
 use num_traits::float::FloatCore;
 use num_traits::{FromPrimitive, Num, PrimInt, ToPrimitive, Zero};
 
@@ -665,6 +665,39 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
     let (base, power) = get_radix_base(radix, big_digit::HALF_BITS);
     let radix = radix as BigDigit;
 
+    // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
+    // performance. We can mitigate this by dividing into chunks of a larger base first.
+    // The threshold for this was chosen by anecdotal performance measurements to
+    // approximate where this starts to make a noticeable difference.
+    if digits.data.len() >= 64 {
+        let mut big_base = BigUint::from(base * base);
+        let mut big_power = 2usize;
+
+        // Choose a target base length near √n.
+        let target_len = digits.data.len().sqrt();
+        while big_base.data.len() < target_len {
+            big_base = &big_base * &big_base;
+            big_power *= 2;
+        }
+
+        // This outer loop will run approximately √n times.
+        while digits > big_base {
+            // This is still the dominating factor, with n digits divided by √n digits.
+            let (q, mut big_r) = digits.div_rem(&big_base);
+            digits = q;
+
+            // This inner loop now has O(√n²)=O(n) behavior altogether.
+            for _ in 0..big_power {
+                let (q, mut r) = div_rem_digit(big_r, base);
+                big_r = q;
+                for _ in 0..power {
+                    res.push((r % radix) as u8);
+                    r /= radix;
+                }
+            }
+        }
+    }
+
     while digits.data.len() > 1 {
         let (q, mut r) = div_rem_digit(digits, base);
         for _ in 0..power {
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
index 343705e1..b5d4259c 100644
--- a/src/biguint/division.rs
+++ b/src/biguint/division.rs
@@ -1,7 +1,7 @@
 use super::addition::__add2;
 #[cfg(not(u64_digit))]
 use super::u32_to_u128;
-use super::BigUint;
+use super::{cmp_slice, BigUint};
 
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::UsizePromotion;
@@ -153,14 +153,14 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
     //
     let shift = d.data.last().unwrap().leading_zeros() as usize;
 
-    let (q, r) = if shift == 0 {
+    if shift == 0 {
         // no need to clone d
-        div_rem_core(u, &d)
+        div_rem_core(u, &d.data)
     } else {
-        div_rem_core(u << shift, &(d << shift))
-    };
-    // renormalize the remainder
-    (q, r >> shift)
+        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        // renormalize the remainder
+        (q, r >> shift)
+    }
 }
 
 pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
@@ -195,24 +195,21 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
     //
     let shift = d.data.last().unwrap().leading_zeros() as usize;
 
-    let (q, r) = if shift == 0 {
+    if shift == 0 {
         // no need to clone d
-        div_rem_core(u.clone(), d)
+        div_rem_core(u.clone(), &d.data)
     } else {
-        div_rem_core(u << shift, &(d << shift))
-    };
-    // renormalize the remainder
-    (q, r >> shift)
+        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        // renormalize the remainder
+        (q, r >> shift)
+    }
 }
 
 /// An implementation of the base division algorithm.
 /// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21.
-fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
-    debug_assert!(
-        a.data.len() >= b.data.len()
-            && b.data.len() > 1
-            && b.data.last().unwrap().leading_zeros() == 0
-    );
+fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
+    debug_assert!(a.data.len() >= b.len() && b.len() > 1);
+    debug_assert!(b.last().unwrap().leading_zeros() == 0);
 
     // The algorithm works by incrementally calculating "guesses", q0, for the next digit of the
     // quotient. Once we have any number q0 such that (q0 << j) * b <= a, we can set
@@ -235,16 +232,16 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
     let mut a0 = 0;
 
     // [b1, b0] are the two most significant digits of the divisor. They never change.
-    let b0 = *b.data.last().unwrap();
-    let b1 = b.data[b.data.len() - 2];
+    let b0 = *b.last().unwrap();
+    let b1 = b[b.len() - 2];
 
-    let q_len = a.data.len() - b.data.len() + 1;
+    let q_len = a.data.len() - b.len() + 1;
     let mut q = BigUint {
         data: vec![0; q_len],
     };
 
     for j in (0..q_len).rev() {
-        debug_assert!(a.data.len() == b.data.len() + j);
+        debug_assert!(a.data.len() == b.len() + j);
 
         let a1 = *a.data.last().unwrap();
         let a2 = a.data[a.data.len() - 2];
@@ -280,11 +277,11 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
         // q0 is now either the correct quotient digit, or in rare cases 1 too large.
         // Subtract (q0 << j) from a. This may overflow, in which case we will have to correct.
 
-        let mut borrow = sub_mul_digit_same_len(&mut a.data[j..], &b.data, q0);
+        let mut borrow = sub_mul_digit_same_len(&mut a.data[j..], b, q0);
         if borrow > a0 {
             // q0 is too large. We need to add back one multiple of b.
             q0 -= 1;
-            borrow -= __add2(&mut a.data[j..], &b.data);
+            borrow -= __add2(&mut a.data[j..], b);
         }
         // The top digit of a, stored in a0, has now been zeroed.
         debug_assert!(borrow == a0);
@@ -298,7 +295,7 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
     a.data.push(a0);
     a.normalize();
 
-    debug_assert!(a < *b);
+    debug_assert_eq!(cmp_slice(&a.data, b), Less);
 
     (q.normalized(), a)
 }
diff --git a/tests/biguint.rs b/tests/biguint.rs
index 716bcc81..5a5979c4 100644
--- a/tests/biguint.rs
+++ b/tests/biguint.rs
@@ -1600,6 +1600,16 @@ fn test_all_str_radix() {
     }
 }
 
+#[test]
+fn test_big_str() {
+    for n in 2..=20_u32 {
+        let x: BigUint = BigUint::from(n).pow(10_000_u32);
+        let s = x.to_string();
+        let y: BigUint = s.parse().unwrap();
+        assert_eq!(x, y);
+    }
+}
+
 #[test]
 fn test_lower_hex() {
     let a = BigUint::parse_bytes(b"A", 16).unwrap();