@@ -97,9 +97,20 @@ def _process_problem_data(self, question):
         }
 
         # Process content with BeautifulSoup to extract description, examples, and constraints
+        import os
         content_html = question.get('content', '')
+        debug_dir = os.path.dirname(os.path.abspath(__file__))
+        debug_content_path = os.path.join(debug_dir, 'debug_content_html.txt')
+        debug_soup_path = os.path.join(debug_dir, 'debug_soup.html')
+        # Write content_html to a debug file for inspection
+        with open(debug_content_path, 'w', encoding='utf-8') as f:
+            f.write(content_html)
+        print(f"[DEBUG] Wrote content_html to {debug_content_path}")
         soup = BeautifulSoup(content_html, 'html.parser')
-
+        # Write soup prettified HTML to a debug file for inspection
+        with open(debug_soup_path, 'w', encoding='utf-8') as f:
+            f.write(soup.prettify())
+        print(f"[DEBUG] Wrote soup HTML to {debug_soup_path}")
         # Get description (text before the first <strong>Example</strong>)
         description = []
         current_element = soup.find()
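
The debug-dump step added above can be exercised outside the scraper. Here is a minimal standalone sketch, assuming bs4 is installed and using a hypothetical HTML string in place of question['content']:

# Standalone sketch of the debug-dump pattern (sample_html is hypothetical).
import os

from bs4 import BeautifulSoup

sample_html = "<p>Given head, determine if the linked list has a cycle.</p>"

debug_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(debug_dir, 'debug_content_html.txt'), 'w', encoding='utf-8') as f:
    f.write(sample_html)  # raw HTML exactly as received

soup = BeautifulSoup(sample_html, 'html.parser')
with open(os.path.join(debug_dir, 'debug_soup.html'), 'w', encoding='utf-8') as f:
    f.write(soup.prettify())  # the same HTML after parsing, re-indented

Comparing the two files shows whether any markup was dropped or rewritten during parsing.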
@@ -113,14 +124,23 @@ def _process_problem_data(self, question):
 
         problem_data['description'] = '\n'.join([d for d in description if d])
 
-        # Extract examples
+        # Extract examples and attach the closest preceding image to each
         examples = []
         example_blocks = soup.find_all('pre')
         for i, example in enumerate(example_blocks, 1):
-            examples.append({
+            example_dict = {
                 'example_num': i,
-                'example_text': example.get_text().strip()
-            })
+                'example_text': example.get_text().strip(),
+                'images': []
+            }
+            # Find the closest preceding <img> tag before this <pre>
+            prev = example.previous_element
+            while prev:
+                if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
+                    example_dict['images'].append(prev['src'])
+                    break
+                prev = prev.previous_element
+            examples.append(example_dict)
         problem_data['examples'] = examples
 
         # Extract constraints
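
The image lookup relies on previous_element, which walks backwards through the parse tree in document order (tags and text nodes alike), so it finds the nearest <img> even when it is not a direct sibling of the <pre>. A minimal sketch of the traversal, using a hypothetical HTML fragment shaped like a LeetCode problem body:

from bs4 import BeautifulSoup

# Hypothetical fragment: an example image followed by its <pre> block.
html = (
    '<p><strong>Example 1:</strong></p>'
    '<img src="https://assets.leetcode.com/uploads/circularlinkedlist.png"/>'
    '<pre>Input: head = [3,2,0,-4], pos = 1\nOutput: true</pre>'
)
soup = BeautifulSoup(html, 'html.parser')
pre = soup.find('pre')

prev = pre.previous_element
while prev:
    # NavigableStrings have name == None, so the guard skips them safely.
    if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
        print(prev['src'])  # nearest preceding image in document order
        break
    prev = prev.previous_element

The walk stops at the first match, so each example gets at most one image; if no <img> precedes a <pre>, the loop runs off the top of the document and the images list stays empty. Note the walk does not stop at section boundaries, so an example without its own image can pick up the previous example's.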
@@ -212,18 +232,22 @@ def scrape_problem_list(self, limit=10):
 
         return problem_list
 
-if __name__ == "__main__":
-    scraper = LeetCodeScraper()
+# if __name__ == "__main__":
+#     scraper = LeetCodeScraper()
 
     # Option 1: Scrape a specific problem
     # problem_data = scraper.scrape_problem("two-sum")
     # print(json.dumps(problem_data, indent=2))
 
+if __name__ == "__main__":
+    scraper = LeetCodeScraper()
+    problem_data = scraper.scrape_problem("linked-list-cycle")
+    print(json.dumps(problem_data, indent=2))
     # Option 2: Scrape multiple problems from the list
-    problem_list = scraper.scrape_problem_list(limit=5)
+    # problem_list = scraper.scrape_problem_list(limit=5)
 
-    # Add a delay between requests to avoid being blocked
-    for problem in problem_list:
-        print(f"Scraping problem: {problem['title']} ({problem['slug']})")
-        scraper.scrape_problem(problem['slug'])
-        time.sleep(2)  # Wait 2 seconds between requests
+    # # Add a delay between requests to avoid being blocked
+    # for problem in problem_list:
+    #     print(f"Scraping problem: {problem['title']} ({problem['slug']})")
+    #     scraper.scrape_problem(problem['slug'])
+    #     time.sleep(2)  # Wait 2 seconds between requests
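
With this change, running the module scrapes linked-list-cycle directly, and each entry in problem_data['examples'] gains an images list. The shape below is inferred from the diff; the text and URL are illustrative, not captured output:

# Illustrative shape of one examples entry after this commit (values hypothetical).
{
    'example_num': 1,
    'example_text': 'Input: head = [3,2,0,-4], pos = 1\nOutput: true',
    'images': ['https://assets.leetcode.com/uploads/circularlinkedlist.png']
}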