99
# Header name -> value mapping carrying a desktop Chrome User-Agent string and
# a browser-style "accept" value.
# NOTE(review): presumably sent with outgoing HTTP requests -- confirm at the call sites.
1010HEADERS = {
1111 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' ,
12- "accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
12+ "accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" ,
1313}
1414
# Exception types that transform() and map_fields() catch and log at debug
# level instead of letting them propagate.
1515PROCESS_ERRORS = (AttributeError , KeyError , IndexError , TypeError )
@@ -52,6 +52,31 @@ def mutate_url(url):
5252 return mutate_results
5353
5454
def transform(scheme_data, extracted_data):
    """Run the scheme's transform callables over the extracted data, in order.

    Each callable listed under ``scheme_data['transforms']`` receives the
    value produced by the previous one (the first receives
    ``extracted_data``). When a callable raises one of ``PROCESS_ERRORS`` the
    running value is reset to an empty dict and the remaining callables are
    still applied to it. A missing or empty ``'transforms'`` entry returns
    the input unchanged.

    Args:
        scheme_data: Mapping describing one scheme; only the optional
            ``'transforms'`` key is read here.
        extracted_data: Value to feed into the first transform.

    Returns:
        The value left after the last transform has run.
    """
    current = extracted_data
    # ``or ()`` covers both an absent key and an explicitly empty/falsy entry.
    for step in scheme_data.get('transforms') or ():
        logging.debug(step)
        try:
            current = step(current)
        except PROCESS_ERRORS as e:
            logging.debug(f'Transform error: { e } ')
            # A failed step wipes the running value rather than aborting.
            current = {}
        logging.debug(current)
    return current
67+
68+
def map_fields(scheme_data, transformed_data):
    """Build a mapping of field name to string value for one scheme.

    Every getter under ``scheme_data['fields']`` is called with
    ``transformed_data``. A result equal to ``None``, ``[]`` or ``{}`` is
    stored as an empty string; any other result is stored as ``str(result)``.
    A getter that raises one of ``PROCESS_ERRORS`` is logged at debug level
    and its field is left out of the returned dict.

    Args:
        scheme_data: Mapping describing one scheme; ``'fields'`` must map
            field names to one-argument callables.
        transformed_data: Value handed to every field getter.

    Returns:
        Dict of field name to string value for the getters that succeeded.
    """
    values = {}
    for name, getter in scheme_data['fields'].items():
        try:
            raw = getter(transformed_data)
            # The emptiness check and str() stay inside the try so that
            # errors they raise are handled the same way as getter errors.
            if raw in (None, [], {}):
                values[name] = ''
            else:
                values[name] = str(raw)
        except PROCESS_ERRORS as e:
            logging.debug(f'Unable to extact field { name } : { e } ')
    return values
78+
79+
5580def extract (page ):
5681 for scheme_name , scheme_data in schemes .items ():
5782 flags = scheme_data ['flags' ]
@@ -80,21 +105,11 @@ def extract(page):
80105
81106 if scheme_data .get ('extract_json' , False ):
82107 extracted = regexp_group .group (1 )
83-
84108 logging .debug ('Extracted: %s' , extracted )
85109
86- transforms = scheme_data .get ('transforms' , [])
87- if transforms :
88- for t in transforms :
89- logging .debug (t )
90- try :
91- extracted = t (extracted )
92- except PROCESS_ERRORS as e :
93- logging .debug (f'Transform error: { e } ' )
94- extracted = {}
95- logging .debug (extracted )
110+ transformed = transform (scheme_data , extracted )
96111
97- json_data = json .loads (extracted )
112+ json_data = json .loads (transformed )
98113
99114 if json_data == {}:
100115 logging .debug ('Unabled to extract json!' )
@@ -107,14 +122,17 @@ def extract(page):
107122 with open ('debug_extracted.json' , 'w' ) as f :
108123 f .write (loaded_json_str )
109124
110- for name , get_field in scheme_data ['fields' ].items ():
111- try :
112- value = get_field (json_data )
113- values [name ] = str (value ) if value not in (None , [], {}) else ''
114- except PROCESS_ERRORS as e :
115- logging .debug (f'Unable to extact field { name } : { e } ' )
125+ values = map_fields (scheme_data , json_data )
116126 else :
117- values = regexp_group .groupdict ()
127+ groupdict = regexp_group .groupdict ()
128+ if groupdict :
129+ values = groupdict
130+ else :
131+ extracted = regexp_group .group (1 )
132+ logging .debug ('Extracted: %s' , extracted )
133+
134+ transformed_data = transform (scheme_data , extracted )
135+ values = map_fields (scheme_data , transformed_data )
118136
119137 if use_html_parser :
120138 soup = bs (page , 'html.parser' )
0 commit comments