In [16]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import json
import pandas as pd

# Fiscal year whose figures are pulled out of the scraped tables.
select_year = 2021

# Firm master list; the first CSV column is used as the index.
Firm = pd.read_csv("Firm.csv", index_col=0)

# Keep only the firms whose data source is MarketWatch, with the columns up to
# and including "Type_Region".  The explicit .copy() gives an independent
# frame, so the column assignments below (and the per-firm writes done while
# crawling) neither raise SettingWithCopyWarning nor depend on whether pandas
# handed back a view or a copy of ``Firm``.
select_firm = Firm.loc[Firm["Source"] == "marketwatch", :"Type_Region"].copy()

# Placeholder columns that the crawler fills in for every firm it reaches.
select_firm["Exchange"] = None
select_firm["Crawled_Name"] = None
select_firm["Stock_Currency"] = None

# Exceptions that indicate the downloaded page did not have the expected
# structure (missing tag, missing JSON key, non-numeric cell, ...).
_PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def _fetch_soup(stock_code, region, page):
    """Download one MarketWatch page for a stock and parse it.

    ``page`` is the path segment that follows the stock code in the URL (for
    example ``"financials/balance-sheet"``).  For regions other than "US" the
    region is appended as the ``countrycode`` query parameter.

    Returns the parsed document, or ``None`` (after printing the error) when
    the download or the HTML parse fails.
    """
    url = f"https://www.marketwatch.com/investing/stock/{stock_code}/{page}"
    if region != "US":
        url += f"?countrycode={region}"
    try:
        # The context manager closes the HTTP response even if read() raises.
        with urlopen(url) as response:
            markup = response.read()
        time.sleep(0.1)  # short pause between consecutive requests
        return BeautifulSoup(markup, 'html.parser')
    except Exception as e:  # best-effort crawl: report the failure and move on
        print(e)
        print("Can't open url.")
        return None


def _read_basic_info(soup):
    """Return the page's ``application/ld+json`` script content as a dict."""
    raw = soup.find(name="script", attrs={'type': "application/ld+json"}).contents[0]
    # Newlines and tabs are stripped before decoding, as raw control
    # characters inside JSON strings are rejected by json.loads.
    return json.loads(raw.replace('\n', '').replace('\t', ''))


def _extract_year_value(soup, table_label, row_label, year):
    """Return the figure for ``year`` from one highlighted row of a table.

    The table is located by its ``aria-label`` and the row by the text of its
    first cell.  Returns ``None`` when the table is absent, the row is absent,
    the year/value columns do not line up, or ``year`` is not listed, so a
    missing figure is never replaced by data left over from another firm.
    """
    table = soup.find(name="table", attrs={"aria-label": table_label})
    if table is None:
        return None

    header_cells = table.find(name="tr", attrs={"class": "table__row"}).find_all(
        lambda tag: tag.name == 'th' and tag.get('class') == ['overflow__heading'])
    # The last heading cell is dropped before conversion; only the remaining
    # ones are parsed as years.
    years = [int(cell.div.contents[0]) for cell in header_cells[:-1]]

    values = None
    for table_row in table.find_all(name="tr", attrs={"class": "table__row is-highlighted"}):
        if table_row.td.div.contents[0] == row_label:
            chart = table_row.find("div", attrs={"class": "chart--financials js-financial-chart"})
            raw_values = chart.attrs['data-chart-data'].split(',')
            # Empty entries are recorded as 0.
            values = [float(item) if item != '' else 0 for item in raw_values]

    if values is None:
        print(f"Row '{row_label}' not found.")
        return None
    if len(values) != len(years):
        print(f"Row '{row_label}' has {len(values)} values for {len(years)} years.")
        return None
    for listed_year, value in zip(years, values):
        if listed_year == year:
            return value
    print(f"Year {year} not listed for '{row_label}'.")
    return None


def _crawl_page(row, page, table_label, row_label, column):
    """Scrape one page for the firm in ``row`` and store the results.

    Writes the exchange, crawled name and currency from the page metadata, and
    the ``select_year`` figure of ``row_label`` into ``column``, for every row
    of ``select_firm`` sharing the firm's stock code.  Pages with an
    unexpected structure are reported and skipped instead of stopping the
    crawl.

    Returns ``True`` when the page was downloaded, ``False`` otherwise.
    """
    soup = _fetch_soup(row['Stock_Code'], row['Stock_Region'], page)
    if soup is None:
        return False

    same_stock = select_firm["Stock_Code"] == row['Stock_Code']
    try:
        basic_info = _read_basic_info(soup)
        select_firm.loc[same_stock, "Exchange"] = basic_info["exchange"]
        select_firm.loc[same_stock, "Crawled_Name"] = basic_info["name"]
        select_firm.loc[same_stock, "Stock_Currency"] = basic_info["priceCurrency"]
        value = _extract_year_value(soup, table_label, row_label, select_year)
    except _PARSE_ERRORS as e:
        print(e)
        print(f"Can't parse {page} page for {row['Stock_Code']}.")
        return True

    if value is not None:
        select_firm.loc[same_stock, column] = value
    return True


# Make sure the result columns exist even if no firm yields a figure, so later
# arithmetic on them does not fail with a KeyError.
for _column in ("Assets", "Revenue"):
    if _column not in select_firm.columns:
        select_firm[_column] = float("nan")

for _, row in select_firm.iterrows():
    # Balance sheet page -> total assets.
    _crawl_page(row, "financials/balance-sheet",
                "Financials - Assets data table", 'Total Assets', "Assets")
    # Income statement page -> sales/revenue; the crawled name is printed as
    # a progress indicator whenever this page could be downloaded.
    if _crawl_page(row, "financials",
                   "Financials - data table", 'Sales/Revenue', "Revenue"):
        print(select_firm.loc[select_firm["Stock_Code"]==row['Stock_Code'], "Crawled_Name"].to_list())

# Conversion rate looked up by stock currency (presumably CNY per unit of the
# listed currency -- TODO confirm the source and date of these rates).
# Currencies absent from this table end up as NaN in 'Exchange_Rate'.
currency_rates = {'EUR': 7.2505, 'HKD': 0.88, 'JPY': 0.051, 'SEK': 0.65, 'USD': 6.88}
select_firm['Exchange_Rate'] = select_firm['Stock_Currency'].map(currency_rates)

# Converted figures: raw value times the exchange rate, then scaled by 1e-8
# (presumably to express amounts in units of 100 million -- verify).
for raw_column, converted_column in (('Assets', 'Assets_zh'), ('Revenue', 'Revenue_zh')):
    in_target_currency = select_firm[raw_column] * select_firm['Exchange_Rate']
    select_firm[converted_column] = in_target_currency * 0.00000001

# Write the result without the index; 'utf-8-sig' prepends a BOM to the file.
select_firm.to_csv('MarketWatch.csv', index=False, encoding='utf-8-sig')

['Dassault Systemes SE']
['Schneider Electric SE']
['China Electronics Huada Technology Co. Ltd.']
['Kingsoft Cloud Holdings Ltd. ADR']
['Baidu Inc.']
['Kingdee International Software Group Co. Ltd.']
['Tencent Holdings Ltd.']
['Xiaomi Corp.']
['AsiaInfo Technologies Ltd.']
['Fanuc Corp.']
['Omron Corp.']
['Mitsubishi Electric Corp.']
['Hexagon AB Series B']
['Hollysys Automation Technologies Ltd.']
['JD.com Inc. ADR']
['Autodesk Inc.']
['Altair Engineering Inc. Cl A']
['Ansys Inc.']
['Honeywell International Inc.']
['PTC Inc.']
['Texas Instruments Inc.']
['Cadence Design Systems Inc.']
['Cisco Systems Inc.']
['Microsoft Corp.']
['Analog Devices Inc.']
['Amazon.com Inc.']
['Intel Corp.']
['ABB Ltd. ADR']
['International Business Machines Corp.']
['Oracle Corp.']
['Salesforce Inc.']
['SAP SE ADR']
['Emerson Electric Co.']
['Dell Technologies Inc. Cl C']
['Hewlett Packard Enterprise Co.']
['Rockwell Automation Inc.']
['General Electric Co.']
['Synopsys Inc.']
['STMicroelectronics N.V.']
